Let’s check out the data!
library(readr)
amazon <- read_csv("amazon_jobs_dataset.csv")
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
X1 = col_integer(),
Title = col_character(),
location = col_character(),
Posting_date = col_character(),
DESCRIPTION = col_character(),
`BASIC QUALIFICATIONS` = col_character(),
`PREFERRED QUALIFICATIONS` = col_character()
)
head(amazon)
Questions that we can focus on: - What are the popular locations of job openings? - Are there certain months with more job openings? - How many jobs are posted every year? - How many jobs are posted every month? - What are the most popular job titles? - What are the minimum educational qualifications? - Which jobs require a higher degree? - What languages are required for jobs? - How many years of experience are required? - Which job categories require more experience? - Word clouds of preferred qualifications - Word clouds of basic qualifications - Word clouds of job description
dim(amazon)
[1] 3493 7
unique(amazon$Posting_date)[1:20]
[1] "March 1, 2018" "February 28, 2018" "February 27, 2018" "February 26, 2018" "February 25, 2018"
[6] "February 24, 2018" "February 23, 2018" "February 22, 2018" "February 21, 2018" "February 20, 2018"
[11] "February 19, 2018" "February 18, 2018" "February 16, 2018" "February 15, 2018" "February 14, 2018"
[16] "February 13, 2018" "February 12, 2018" "February 11, 2018" "February 9, 2018" "February 8, 2018"
We see that some dates have multiple spaces between the month and the day. Let’s split each date into year, month and day.
library(stringr)
amazon$posting_month <- str_extract(amazon$Posting_date , "([A-Za-z]+)")
amazon$posting_day <- str_extract(amazon$Posting_date , "([0-9]+)")
amazon$posting_year <- str_extract(amazon$Posting_date, "([0-9]{4})")
unique(amazon$posting_month)
[1] "March" "February" "January" "December" "November" "October" "September" "August" "July"
[10] "June" "May" "April"
unique(amazon$posting_day)
[1] "1" "28" "27" "26" "25" "24" "23" "22" "21" "20" "19" "18" "16" "15" "14" "13" "12" "11" "9" "8" "7" "6"
[23] "5" "4" "3" "2" "31" "30" "29" "17" "10"
unique(amazon$posting_year)
[1] "2018" "2017" "2016" "2015" "2014" "2013" "2012" "2011"
Let’s also extract the countries from location column.
amazon$country <- str_extract(amazon$location, "[A-Za-z]+")
unique(amazon$country)
[1] "US" "IN" "RO" "CA" "DE" "PL" "AE" "ES" "IT" "IL" "UK" "ZA" "SG" "AU" "IE" "FR" "BR" "CN" "MX" "NL" "JO" "TW"
[23] "LU" "JP" NA
unique(amazon$location)
[1] "US, WA, Seattle" "IN, KA, Bangalore" "US, CA, Cupertino" "RO, Ia<U+015F>i"
[5] "US, CA, East Palo Alto" "US, CA, Santa Monica" "US, CA, Sunnyvale" "US, CA, Palo Alto"
[9] "US, MA, Boston" "US, MA, Cambridge" "CA, BC, Vancouver" "US, TX, Austin"
[13] "IN, TS, Hyderabad" "US, CA, San Francisco" "DE, BY, Munich" "RO, Iasi"
[17] "PL, Gdansk" "PL, Bielany Wroclawskie" "DE, Berlin" "US, CA, Irvine"
[21] "CA, ON, Toronto" "US, MA, North Reading" "US, MN, Minneapolis" "AE, Dubai"
[25] "US, CA, San Diego" "US, CA, Santa Barbara" "US, OR, Portland" "ES, Madrid"
[29] "IT, Vercelli" "US, NJ, Newark" "US, NY, New York" "IN, TN, Chennai"
[33] "IL, Herzliya" "DE, Aachen" "US, MA, Westborough" "US, VA, Herndon"
[37] "US, WA, Bellevue" "UK, London" "ZA, Cape Town" "US, MA, Andover"
[41] "US, CO, Boulder" "US, CA, Santa Cruz" "US, CA, San Luis Obispo" "UK, Cambridge"
[45] "SG, Singapore" "AU, NSW, Sydney" "IE, DUBLIN, Dublin" "US, CO, Broomfield"
[49] "FR, Paris" "US, Virtual" "UK, Edinburgh" "US, AZ, Tempe"
[53] "IL, Haifa" "BR, SP, Sao Paulo" "IN, DL, Gurgaon" "IN, HR, Gurgaon"
[57] "CN, Beijing" "MX, EMEX, Mexico City" "NL, The Hague" "US, VA, Ballston"
[61] "IT, MI, Milan" "JO, Amman" "CA, MB, Winnipeg" "US, PA, Pittsburgh"
[65] "CN, Shenzhen" "US, GA, Atlanta" "UK, N YORK, Heslington" "US, NJ, Jersey City"
[69] "TW, TP, Taipei" "US, CO, Denver" "DE, BY, Graben" "CN, Shanghai"
[73] "RO, Bucharest" "DE, SN, Dresden" "LU, Luxembourg" "DE, Standortuebergreifend"
[77] "US, CA, Foster City" "UK, CAMBS, Cambridge" "IL, Hod Hasharon" "JP, Meguro"
[81] "IT, AT, Asti" "IL," "UK, BRIST, Bristol" "CA, ON,"
[85] "IN, MH, Pune" "US, CA, San Jose" "CA," "CA, ON, Ottawa"
[89] NA "US, NV, Las Vegas"
amazon$city <- word(amazon$location, -1)
unique(amazon$city)
[1] "Seattle" "Bangalore" "Cupertino" "Ia<U+015F>i"
[5] "Alto" "Monica" "Sunnyvale" "Boston"
[9] "Cambridge" "Vancouver" "Austin" "Hyderabad"
[13] "Francisco" "Munich" "Iasi" "Gdansk"
[17] "Wroclawskie" "Berlin" "Irvine" "Toronto"
[21] "Reading" "Minneapolis" "Dubai" "Diego"
[25] "Barbara" "Portland" "Madrid" "Vercelli"
[29] "Newark" "York" "Chennai" "Herzliya"
[33] "Aachen" "Westborough" "Herndon" "Bellevue"
[37] "London" "Town" "Andover" "Boulder"
[41] "Cruz" "Obispo" "Singapore" "Sydney"
[45] "Dublin" "Broomfield" "Paris" "Virtual"
[49] "Edinburgh" "Tempe" "Haifa" "Paulo"
[53] "Gurgaon" "Beijing" "City" "Hague"
[57] "Ballston" "Milan" "Amman" "Winnipeg"
[61] "Pittsburgh" "Shenzhen" "Atlanta" "Heslington"
[65] "Taipei" "Denver" "Graben" "Shanghai"
[69] "Bucharest" "Dresden" "Luxembourg" "Standortuebergreifend"
[73] "Hasharon" "Meguro" "Asti" "IL,"
[77] "Bristol" "ON," "Pune" "Jose"
[81] "CA," "Ottawa" NA "Vegas"
Let us correct the city names that have multiple words.
amazon$city <- gsub(pattern = "Alto", replacement = "East Palo Alto", amazon$city)
amazon$city <- gsub(pattern = "Ia<U+015F>i", replacement = "Iasi", amazon$city)
amazon$city <- gsub(pattern = "Monica", replacement = "Santa Monica", amazon$city)
amazon$city <- gsub(pattern = "Francisco", replacement = "San Francisco", amazon$city)
amazon$city <- gsub(pattern = "Wroclawskie", replacement = "Bielany Wroclawskie", amazon$city)
amazon$city <- gsub(pattern = "Diego", replacement = "San Diego", amazon$city)
amazon$city <- gsub(pattern = "Barbara", replacement = "Santa Barbara", amazon$city)
amazon$city <- gsub(pattern = "York", replacement = "New York", amazon$city)
amazon$city <- gsub(pattern = "Town", replacement = "Cape Town", amazon$city)
amazon$city <- gsub(pattern = "Cruz", replacement = "Santa Cruz", amazon$city)
amazon$city <- gsub(pattern = "Obispo", replacement = "San Luis Obispo", amazon$city)
amazon$city <- gsub(pattern = "Paulo", replacement = "Sao Paulo", amazon$city)
amazon$city <- gsub(pattern = "Hasharon", replacement = "Hod Hasharon", amazon$city)
amazon$city <- gsub(pattern = "IL,", replacement = "Hod Hasharon", amazon$city)
amazon$city <- gsub(pattern = "On,", replacement = "Ottawa", amazon$city)
amazon$city <- gsub(pattern = "Jose", replacement = "San Jose", amazon$city)
amazon$city <- gsub(pattern = "CA,", replacement = "Ottawa", amazon$city)
amazon$city <- gsub(pattern = "Vegas", replacement = "Las Vegas", amazon$city)
The word “City” has been used with three different cities, so include them.
amazon$city <- ifelse(amazon$location == "US, CA, Foster City", "Foster City", amazon$city)
amazon$city <- ifelse(amazon$location == "MX, EMEX, Mexico City", "Mexico City", amazon$city)
amazon$city <- ifelse(amazon$location == "US, NJ, Jersey City", "Jersey City", amazon$city)
head(amazon)
What are most popular locations of Job Openings?
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:plyr':
arrange, count, desc, failwith, id, mutate, rename, summarise, summarize
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
location <- amazon[, c("country", "city")]
loc1 <- as.data.frame(table(location$city))
colnames(loc1) <- c("city", "Freq")
loc1 <- merge(loc1,location,by="city")
loc1 <- arrange(loc1, desc(Freq))
loc1 <- loc1 %>% distinct(city, .keep_all = TRUE)
loc1
Loading required package: lattice


Distribution of Jobs in all countries except US
p3 <- ggplot(filter(loc1,loc1$country!="US"), aes(x=factor(country), y=Freq, fill = factor(country))) +
geom_boxplot()
p3

Distribution of jobs in US
p4 <- ggplot(data = filter(loc1, loc1$country == "US"), aes(city, Freq, fill = city)) +
geom_bar(stat = "identity") +
geom_text(aes(label=Freq), color="black", size=3) +
ggtitle("Jobs in US") +
coord_flip() +
guides(fill=FALSE) +
tilt_theme
p4

Analysis of Job Posting Date
How many jobs are posted every year and every month? Is there a certain period of the year when jobs are posted more frequently?
amazon$Posting_date <- paste(amazon$posting_year, amazon$posting_month, amazon$posting_day,sep="-")
amazon$Posting_date <- as.Date(strptime(amazon$Posting_date,format="%Y-%b-%d"))
#amazon$Posting_date$zone <- NULL
(amazon$Posting_date[1:10])
[1] "2018-03-01" "2018-03-01" "2018-03-01" "2018-03-01" "2018-03-01" "2018-03-01" "2018-03-01" "2018-03-01"
[9] "2018-03-01" "2018-03-01"
#get day of the week based on date
dow <- function(x) format(as.Date(x), "%A")
amazon$weekday <- dow(amazon$Posting_date)
amazon$weekday[1:20]
[1] "Thursday" "Thursday" "Thursday" "Thursday" "Thursday" "Thursday" "Thursday" "Thursday" "Thursday"
[10] "Thursday" "Thursday" "Thursday" "Thursday" "Wednesday" "Wednesday" "Wednesday" "Wednesday" "Wednesday"
[19] "Wednesday" "Wednesday"
posting_date <- amazon[,c("posting_month", "posting_year", "weekday", "Posting_date")]
head(posting_date)
year <- as.data.frame(table(posting_date$posting_year))
colnames(year) <- c("Year", "Freq")
p5 <- ggplot(data = year, aes(Year, Freq, fill = Year)) +
geom_bar(stat = "identity") +
geom_text(aes(label=Freq), color="black", size=3) +
ggtitle("Jobs postings by Year") +
guides(fill=FALSE) +
tilt_theme
p5

mon <- as.data.frame(table(posting_date$posting_month))
colnames(mon) <- c("Mon", "Freq")
lev = c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")
mon$Month <- factor(mon$Mon, lev)
p6 <- ggplot(data = mon, aes(Month, Freq, fill = Month)) +
geom_bar(stat = "identity") +
geom_text(aes(label=Freq), color="black", size=3) +
ggtitle("Jobs postings by Month") +
guides(fill=FALSE) +
tilt_theme
p6

jday <- as.data.frame(table(posting_date$weekday))
colnames(jday) <- c("wd", "Freq")
lev <- c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
jday$weekday <- factor(jday$wd, lev)
p7 <- ggplot(data = jday, aes(weekday, Freq, fill = weekday)) +
geom_bar(stat = "identity") +
geom_text(aes(label=Freq), color="black", size=3) +
ggtitle("Jobs postings by Weekday") +
guides(fill=FALSE) +
tilt_theme
p7

The above graphs show that most of the job postings are recent, with a large increase in postings in 2017 and 2018. January and February are the busiest months for postings. Jobs are also generally posted on weekdays, mostly on Tuesdays and Wednesdays. One plausible explanation is that employees tend to get more work done in the first half of the working week.
Analysis of Job titles
What are most popular job titles?
title <- arrange(as.data.frame(table(amazon$Title)), desc(Freq))
colnames(title) <- c("Title", "Freq")
p8 <- ggplot(data = head(title, 10), aes(Title, Freq, fill = Title)) +
geom_bar(stat = "identity") +
geom_text(aes(label=Freq), color="black", size=3) +
ggtitle("Top 10 jobs") +
guides(fill=FALSE) +
coord_flip() +
tilt_theme
p8

There are various job postings. Top 10 are shown above. Let us now divide it in different domains and different positions.
# software, game, quality, data, web, security, sale,iot, research
# lead, manager, developer, engineer, consultant, artist, analyst, scientist,
domain <- c("software", "game", "quality", "data", "web", "security", "sale", "research", "iot", "ui")
positions <- c("lead", "leader","manager", "developer", "engineer", "consultant", "artist", "analyst", "scientist")
amazon$Title <- tolower(amazon$Title)
#unique(amazon$Title)
amazon$domain <- NA
amazon$positions <- NA
for(i in domain){
amazon$domain1 <- grepl(i, amazon$Title)
amazon$domain <- ifelse(amazon$domain1 == "TRUE", i, amazon$domain)
}
amazon$domain <- ifelse(amazon$domain == FALSE, "other", amazon$domain)
for(i in positions){
amazon$pos1 <- grepl(i, amazon$Title)
amazon$positions <- ifelse(amazon$pos1 == "TRUE", i, amazon$positions)
}
amazon$positions <- ifelse(is.na(amazon$positions), "other", amazon$positions)
amazon <- subset(amazon, select = -c(domain1, pos1))
pos <- as.data.frame(table(amazon$positions))
colnames(pos) <- c("position", "Freq")
p9 <- ggplot(data = pos, aes(position, Freq, fill = position)) +
geom_bar(stat = "identity") +
geom_text(aes(label=Freq), color="black", size=3) +
ggtitle("Top 10 domains") +
guides(fill=FALSE) +
coord_flip()
p9

dom <- as.data.frame(table(amazon$domain))
colnames(dom) <- c("domain", "Freq")
p9 <- ggplot(data = dom, aes(domain, Freq, fill = domain)) +
geom_bar(stat = "identity") +
geom_text(aes(label=Freq), color="black", size=3) +
ggtitle("Top 10 domains") +
guides(fill=FALSE) +
coord_flip()
p9

What are minimum educational qualifications?
amazon$education <- NA
degree_list = c("ba", "bs", "b.tech","bachelor", "phd","ms","master", "mba","m.tech")
amazon$`BASIC QUALIFICATIONS` <- tolower(amazon$`BASIC QUALIFICATIONS`)
for(i in degree_list){
amazon$deg1 <- grepl(i, amazon$`BASIC QUALIFICATIONS`)
amazon$education <- ifelse(amazon$deg1 == "TRUE", i, amazon$education)
}
amazon$education <- ifelse(is.na(amazon$education), "other", amazon$education)
amazon <- subset(amazon, select = -c(deg1))
edu <- as.data.frame(table(amazon$education))
colnames(edu) <- c("Education", "Freq")
p10 <- ggplot(data = edu, aes(Education, Freq, fill = Education)) +
geom_bar(stat = "identity") +
geom_text(aes(label=Freq), color="black", size=3) +
ggtitle("Jobs based on degrees") +
guides(fill=FALSE) +
coord_flip()
p10

Basic Qualifications generally requires Masters degree.
Which jobs require which type of degrees?


What languages are in demand?
list_of_occurence <- c()
languages <- c('swift','matlab','mongodb','hadoop','cosmos', 'mysql','spark', 'pig', 'python', 'java.', 'java,','c[++]', 'php', 'javascript', 'objective c', 'ruby', 'perl','c ','c#', ' r,')
amazon$`PREFERRED QUALIFICATIONS` <- tolower(amazon$`PREFERRED QUALIFICATIONS`)
for(i in languages){
amazon$dummy <- str_count(amazon$`PREFERRED QUALIFICATIONS`, i)
list_of_occurence <- c(list_of_occurence, sum(amazon$dummy, na.rm = TRUE))
}
list_of_occurence
[1] 26 9 16 164 1 50 128 20 378 1080 392 504 34 348 79 237 188 630 139 7
lan <- data.frame(cbind(languages, as.numeric(list_of_occurence)))
p15 <- ggplot(data = lan, aes(languages, list_of_occurence, fill = languages)) +
geom_bar(stat = "identity") +
geom_text(aes(label=list_of_occurence), color="black", size=3) +
ggtitle("Languages in job description") +
guides(fill=FALSE) +
coord_flip()
p15

Java, C, C++ and Python are the most in-demand languages in these job postings.
How many years of experience is required?
amazon$dummy <- str_extract(amazon$`BASIC QUALIFICATIONS`, '([0-9]+) year')
amazon$dummy <- ifelse(is.na(amazon$dummy), "0 year", amazon$dummy)
split_year <- function(var){
return(as.numeric(unlist(str_split(var, " "))[1]))
}
amazon$experience <- sapply(amazon$dummy, split_year)
Let’s see the experience, excluding 0 year experience.
exp <- as.data.frame(table(amazon$experience))
colnames(exp) <- c("Experience", "Freq")
p16 <- ggplot(data = filter(exp, exp$Experience != 0), aes(Experience, Freq, fill = Experience)) +
geom_bar(stat = "identity") +
geom_text(aes(label=Freq), color="black", size=3) +
ggtitle("Experience needed") +
guides(fill=FALSE) +
coord_flip()
p16



Word Clouds of Qualifications and Job description
library(wordcloud)
library(SnowballC)
library(RColorBrewer)
library(tm)
texts <- amazon$`PREFERRED QUALIFICATIONS`
#texts <- iconv(texts, to = "utf-8")
corpus <- Corpus(VectorSource(texts))
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords('english'))
#corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, removeWords, c("00b7","and", "this", "there"))
corpus <- Corpus(VectorSource(corpus))
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
d <- d[-which(d$word %in% c("00b7","and","this","that")),]
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
title("Preferred Qualifications Word Cloud")

texts <- amazon$`BASIC QUALIFICATIONS`
#texts <- iconv(texts, to = "utf-8")
corpus <- Corpus(VectorSource(texts))
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords('english'))
#corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, removeWords, c("00b7","and", "this", "there"))
corpus <- Corpus(VectorSource(corpus))
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
d <- d[-which(d$word %in% c("00b7","and","this","that")),]
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
title("Basic Qualifications Word Cloud")


texts <- amazon$Title
#texts <- iconv(texts, to = "utf-8")
corpus <- Corpus(VectorSource(texts))
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords('english'))
#corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, removeWords, c("00b7","and", "this", "there", "you", "will"))
corpus <- Corpus(VectorSource(corpus))
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
#d <- d[-which(d$word %in% c("00b7","and","this","that", "you", "will")),]
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
title("Job Title Word Cloud")

---
title: "Amazon Jobs"
output: html_notebook
---

Let's check out the data!

```{r}
# Read the job postings from the CSV file and preview the first rows.
library(readr)
amazon <- read_csv(file = "amazon_jobs_dataset.csv")
head(amazon, n = 6L)
```

Questions that we can focus on:
  - What are the popular locations of job openings?
  - Are there certain months with more job openings?
    - How many jobs are posted every year?
    - How many jobs are posted every month?
  - What are most popular job titles?
  - What are minimum educational qualifications?
    - Which job requires higher degree?
  - What languages are required for jobs?
  - How many years of experience is required?
    - Which job categories require more experience?
  - Word clouds of preferred qualifications
  - Word clouds of Basic Qualifications
  - Word clouds of job description

```{r}
# Number of rows and columns in the raw table.
dim(amazon)
```
```{r}
# Look at the first 20 distinct raw date strings to see how they are formatted.
unique(amazon$Posting_date)[1:20]
```

We see that some dates have multiple spaces between the month and the day. Let's split each date into year, month and day.

```{r}
library(stringr)
# Pull each date component out with its own pattern: the first run of letters
# is the month, the first run of digits the day, the first four-digit run the
# year.
amazon[["posting_month"]] <- str_extract(
  string = amazon[["Posting_date"]],
  pattern = "([A-Za-z]+)"
)
amazon[["posting_day"]] <- str_extract(
  string = amazon[["Posting_date"]],
  pattern = "([0-9]+)"
)
amazon[["posting_year"]] <- str_extract(
  string = amazon[["Posting_date"]],
  pattern = "([0-9]{4})"
)
```

```{r}
# Sanity-check the extracted pieces by listing their distinct values.
unique(amazon$posting_month)
unique(amazon$posting_day)
unique(amazon$posting_year)
```
Let's also extract the countries from location column.

```{r}
# The country code is the first run of letters in the location string.
amazon[["country"]] <- str_extract(
  string = amazon[["location"]],
  pattern = "[A-Za-z]+"
)
unique(amazon[["country"]])
```

```{r}
# List every distinct raw location string.
unique(amazon$location)
```

```{r}
# First pass at the city: keep the last whitespace-separated word of the
# location. Multi-word city names are truncated by this step.
amazon[["city"]] <- word(string = amazon[["location"]], start = -1)
unique(amazon[["city"]])
```

Let us correct the city names that have multiple words.
```{r}
# Rebuild the city from the whole last comma-separated field of `location`.
# Patching the single last word token by token is lossy and error-prone:
#   * every "Alto" became "East Palo Alto", so Palo Alto postings were
#     relabelled as East Palo Alto;
#   * "North Reading" and "The Hague" stayed truncated;
#   * the "On," pattern never matched the actual value "ON,".
amazon$city <- trimws(sub("^.*,", "", amazon$location))
# Locations ending in a comma carry no city; mark them missing for now.
amazon$city[amazon$city %in% ""] <- NA

# The same Romanian city appears both as "Iasi" and spelled with U+015F.
# The pattern has to use a real \u escape: the text "<U+015F>" is only how the
# console prints that character and never matches it.
amazon$city <- gsub("Ia\u015Fi", "Iasi", amazon$city, fixed = TRUE)

# Locations without any city keep the cities this clean-up has always
# assigned to them ("CA, ON," is what the old "On," rule was aiming at).
city_for_bare_location <- c(
  "IL," = "Hod Hasharon",
  "CA," = "Ottawa",
  "CA, ON," = "Ottawa"
)
bare_location <- amazon$location %in% names(city_for_bare_location)
amazon$city[bare_location] <-
  unname(city_for_bare_location[amazon$location[bare_location]])
```

Three city names end in the word "City", so restore their full names from the location.

```{r}
# Restore the full name of each of the three "... City" locations; every other
# row keeps the city it already has.
amazon$city <- ifelse(
  test = amazon$location == "US, CA, Foster City",
  yes = "Foster City",
  no = amazon$city
)
amazon$city <- ifelse(
  test = amazon$location == "MX, EMEX, Mexico City",
  yes = "Mexico City",
  no = amazon$city
)
amazon$city <- ifelse(
  test = amazon$location == "US, NJ, Jersey City",
  yes = "Jersey City",
  no = amazon$city
)
```

```{r}
# Preview the table again, now including the derived columns.
head(amazon)
```

# What are most popular locations of Job Openings?

```{r}
library(dplyr)
location <- amazon[, c("country", "city")]

# Count postings per (country, city) pair, most frequent first.
# Counting on the city name alone merges different places that share a name
# (the data has both "US, MA, Cambridge" and "UK, Cambridge") and then labels
# the merged total with whichever country happens to come first. Rows without
# a city are left out, as they were by table().
# Functions are namespaced because plyr (attached later via Rmisc) exports
# functions with the same names.
loc1 <- location %>%
  dplyr::filter(!is.na(city)) %>%
  dplyr::count(country, city) %>%
  dplyr::select(city, Freq = n, country) %>%
  dplyr::arrange(dplyr::desc(Freq))
loc1
```


```{r fig.width=5, fig.height=3, echo=FALSE}
library(ggplot2)
library(Rmisc)
# Reusable theme fragment: rotate the x-axis tick labels by 45 degrees.
tilt_theme <- theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Horizontal bars for the first ten rows of `loc1`, coloured by country, with
# the count printed on each bar.
p1 <- ggplot(head(loc1, n = 10), mapping = aes(x = city, y = Freq, fill = country)) +
  geom_col() +
  geom_text(mapping = aes(label = Freq), colour = "black", size = 3) +
  labs(title = "Top 10 Job Locations") +
  coord_flip() +
  tilt_theme
p1
```

```{r fig.width=5, fig.height=3, echo=FALSE}
# Same chart as above, but for the last 50 rows of `loc1`.
p2 <- ggplot(tail(loc1, n = 50), mapping = aes(x = city, y = Freq, fill = country)) +
  geom_col() +
  geom_text(mapping = aes(label = Freq), colour = "black", size = 3) +
  labs(title = "Last 50 Job Locations") +
  coord_flip() +
  tilt_theme
p2
```

Distribution of Jobs in all countries except US
```{r}
# Box plot of per-city posting counts, one box per country, US excluded.
p3 <- ggplot(
  data = filter(loc1, country != "US"),
  mapping = aes(x = factor(country), y = Freq, fill = factor(country))
) +
  geom_boxplot()
p3
```

Distribution of jobs in US

```{r}
# Posting counts for each US city as horizontal bars, with the count printed
# on every bar.
p4 <- ggplot(data = filter(loc1, loc1$country == "US"), aes(city, Freq, fill = city)) +
      geom_bar(stat = "identity") +
      geom_text(aes(label = Freq), color = "black", size = 3) +
      ggtitle("Jobs in US") +
      coord_flip() +
      # The fill colour only repeats the axis labels, so hide its legend.
      # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; "none" is the
      # supported spelling.
      guides(fill = "none") +
      tilt_theme
p4
```

# Analysis of Job Posting Date

## How many jobs are posted every year and every month? Is there a certain period of the year when jobs are posted more frequently?

```{r}

# Re-assemble the cleaned year, month name and day as "YYYY-Month-D" and parse
# the result, replacing the raw text column with a Date column.
amazon$Posting_date <- as.Date(
  strptime(
    paste(amazon$posting_year, amazon$posting_month, amazon$posting_day, sep = "-"),
    format = "%Y-%b-%d"
  )
)
print(amazon$Posting_date[1:10])

# Full weekday name (in the current locale) for anything as.Date() accepts.
dow <- function(x) {
  weekdays(as.Date(x))
}
# Store the weekday name of every posting date and show the first 20.
amazon[["weekday"]] <- dow(amazon[["Posting_date"]])
amazon[["weekday"]][1:20]
```

```{r}
# Keep only the date-related columns for the frequency charts that follow.
posting_date <- dplyr::select(amazon, posting_month, posting_year, weekday, Posting_date)
head(posting_date, n = 6L)
```

```{r}
# Number of postings per year, one bar per year with its count printed on it.
year <- as.data.frame(table(posting_date$posting_year))
colnames(year) <- c("Year", "Freq")
p5 <- ggplot(data = year, aes(Year, Freq, fill = Year)) +
      geom_bar(stat = "identity") +
      geom_text(aes(label = Freq), color = "black", size = 3) +
      ggtitle("Jobs postings by Year") +
      # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none".
      guides(fill = "none") +
      tilt_theme
p5
```

```{r}
# Number of postings per month, plotted in calendar order.
mon <- as.data.frame(table(posting_date$posting_month))
colnames(mon) <- c("Mon", "Freq")
# table() sorts month names alphabetically; re-level them with base R's
# built-in `month.name` (January ... December) instead of a hand-typed vector.
lev <- month.name
mon$Month <- factor(mon$Mon, lev)

p6 <- ggplot(data = mon, aes(Month, Freq, fill = Month)) +
      geom_bar(stat = "identity") +
      geom_text(aes(label = Freq), color = "black", size = 3) +
      ggtitle("Jobs postings by Month") +
      # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none".
      guides(fill = "none") +
      tilt_theme
p6
```

```{r}
# Number of postings per weekday, plotted Sunday through Saturday.
jday <- as.data.frame(table(posting_date$weekday))
colnames(jday) <- c("wd", "Freq")
# NOTE(review): these levels are English day names, while `weekday` is
# produced with a locale-dependent format -- confirm the session locale.
lev <- c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
jday$weekday <- factor(jday$wd, lev)

p7 <- ggplot(data = jday, aes(weekday, Freq, fill = weekday)) +
      geom_bar(stat = "identity") +
      geom_text(aes(label = Freq), color = "black", size = 3) +
      ggtitle("Jobs postings by Weekday") +
      # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none".
      guides(fill = "none") +
      tilt_theme
p7
```

The above graphs show that most of the job postings are recent, with a large increase in postings in 2017 and 2018. January and February are the busiest months for postings. Jobs are also generally posted on weekdays, mostly on Tuesdays and Wednesdays. One plausible explanation is that employees tend to get more work done in the first half of the working week.

# Analysis of Job titles

## What are most popular job titles?

```{r}
# Frequency of every distinct job title, most common first; plot the top ten.
title <- arrange(as.data.frame(table(amazon$Title)), desc(Freq))
colnames(title) <- c("Title", "Freq")
p8 <- ggplot(data = head(title, 10), aes(Title, Freq, fill = Title)) +
      geom_bar(stat = "identity") +
      geom_text(aes(label = Freq), color = "black", size = 3) +
      ggtitle("Top 10 jobs") +
      # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none".
      guides(fill = "none") +
      coord_flip() +
      tilt_theme
p8
```

There is a wide variety of job titles; the top 10 are shown above. Let us now group the titles into different domains and different positions.
```{r}

# Lower-case the titles so the keyword search below is case-insensitive, and
# start with empty (NA) domain / position labels.
amazon$Title <- tolower(amazon$Title)
amazon$domain <- NA
amazon$positions <- NA

# Keywords searched for inside each job title.
domain <- c(
  "software", "game", "quality", "data", "web",
  "security", "sale", "research", "iot", "ui"
)
positions <- c(
  "lead", "leader", "manager", "developer", "engineer",
  "consultant", "artist", "analyst", "scientist"
)

```



```{r}
# Label every posting with the keyword found in its title. Keywords are
# applied in order, so when several occur in one title the last one in the
# vector wins (e.g. "leader" overrides "lead").
for (keyword in domain) {
  amazon$domain[grepl(keyword, amazon$Title)] <- keyword
}
# Titles that matched no keyword still hold NA here. The previous test,
# `amazon$domain == FALSE`, evaluates to NA for those rows, so they were never
# relabelled; test for NA explicitly instead.
amazon$domain[is.na(amazon$domain)] <- "other"

for (keyword in positions) {
  amazon$positions[grepl(keyword, amazon$Title)] <- keyword
}
amazon$positions[is.na(amazon$positions)] <- "other"

```

```{r}
# Number of postings per position label.
pos <- as.data.frame(table(amazon$positions))
colnames(pos) <- c("position", "Freq")
p9 <- ggplot(data = pos, aes(position, Freq, fill = position)) +
      geom_bar(stat = "identity") +
      geom_text(aes(label = Freq), color = "black", size = 3) +
      # This chart shows positions; the old title "Top 10 domains" belonged to
      # the domain chart.
      ggtitle("Jobs by position") +
      # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none".
      guides(fill = "none") +
      coord_flip()
p9
```

```{r}
# Number of postings per domain label.
dom <- as.data.frame(table(amazon$domain))
colnames(dom) <- c("domain", "Freq")
p9 <- ggplot(data = dom, aes(domain, Freq, fill = domain)) +
      geom_bar(stat = "identity") +
      geom_text(aes(label = Freq), color = "black", size = 3) +
      ggtitle("Top 10 domains") +
      # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none".
      guides(fill = "none") +
      coord_flip()
p9
```

# What are minimum educational qualifications?

```{r}
amazon$education <- NA
degree_list <- c("ba", "bs", "b.tech", "bachelor", "phd", "ms", "master", "mba", "m.tech")
amazon$`BASIC QUALIFICATIONS` <- tolower(amazon$`BASIC QUALIFICATIONS`)

# One regular expression per degree label. Searching for the bare labels
# matches inside ordinary words ("ba" in "based", "bs" in "jobs", "ms" in
# "systems"), and the "." in "b.tech" / "m.tech" matches any character, which
# inflates the counts. Anchor short labels on word boundaries and escape the
# dots; "bachelor" and "master" are anchored on the left only so that
# "bachelors", "bachelor's", "masters" and "master's" still match.
degree_patterns <- c(
  "ba"       = "\\bba\\b",
  "bs"       = "\\bbs\\b",
  "b.tech"   = "\\bb\\.tech\\b",
  "bachelor" = "\\bbachelor",
  "phd"      = "\\bphd\\b",
  "ms"       = "\\bms\\b",
  "master"   = "\\bmaster",
  "mba"      = "\\bmba\\b",
  "m.tech"   = "\\bm\\.tech\\b"
)

# Labels are applied in `degree_list` order, so the last matching degree wins.
for (degree in degree_list) {
  has_degree <- grepl(degree_patterns[[degree]], amazon$`BASIC QUALIFICATIONS`, perl = TRUE)
  amazon$education[has_degree] <- degree
}
amazon$education[is.na(amazon$education)] <- "other"

```

```{r}
# Number of postings per education label.
edu <- as.data.frame(table(amazon$education))
colnames(edu) <- c("Education", "Freq")
p10 <- ggplot(data = edu, aes(Education, Freq, fill = Education)) +
      geom_bar(stat = "identity") +
      geom_text(aes(label = Freq), color = "black", size = 3) +
      ggtitle("Jobs based on degrees") +
      # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none".
      guides(fill = "none") +
      coord_flip()
p10
```

The basic qualifications most often call for a Master's degree.

## Which jobs require which type of degrees?


```{r fig.width=7, fig.height=5, echo=FALSE}
# Cross-tabulate domain against education. "software" is drawn in its own
# panel, separately from all the other domains.
counts <- as.data.frame(table(amazon$domain, amazon$education))
colnames(counts) <- c("Domain", "Education", "Freq")

p11 <- ggplot(filter(counts, Domain != "software"), mapping = aes(x = Domain, y = Freq)) +
  geom_col(mapping = aes(fill = Education), position = "dodge") +
  labs(title = "Domain distribution based on Education")

p12 <- ggplot(filter(counts, Domain == "software"), mapping = aes(x = Domain, y = Freq)) +
  geom_col(mapping = aes(fill = Education), position = "dodge") +
  labs(title = "Domain distribution based on Education")

multiplot(plotlist = list(p11, p12))
```

```{r fig.width=7, fig.height=5, echo=FALSE}
# Cross-tabulate position against education. "engineer" is drawn in its own
# panel, separately from all the other positions.
counts <- as.data.frame(table(amazon$positions, amazon$education))
colnames(counts) <- c("Positions", "Education", "Freq")

p13 <- ggplot(filter(counts, Positions != "engineer"), mapping = aes(x = Positions, y = Freq)) +
  geom_col(mapping = aes(fill = Education), position = "dodge") +
  labs(title = "Positions distribution based on Education")

p14 <- ggplot(filter(counts, Positions == "engineer"), mapping = aes(x = Positions, y = Freq)) +
  geom_col(mapping = aes(fill = Education), position = "dodge") +
  labs(title = "Positions distribution based on Education")

multiplot(plotlist = list(p13, p14))
```

# What languages are in demand?

```{r}
amazon$`PREFERRED QUALIFICATIONS` <- tolower(amazon$`PREFERRED QUALIFICATIONS`)

# Display label -> regular expression for each technology.
# The labels used to double as the patterns, which miscounted badly:
#   * "java." matched "javas" in "javascript", and java was counted twice
#     ("java." and "java,");
#   * "c " matched every word ending in "c" ("basic ", "specific ");
#   * "c[++]" is a one-character class, not the literal "++".
# Word boundaries and escaped metacharacters keep each count to the term
# itself. Plain "c" excludes "c++", "c#" and "objective c"; "r" excludes "r&d".
language_patterns <- c(
  "swift"       = "\\bswift\\b",
  "matlab"      = "\\bmatlab\\b",
  "mongodb"     = "\\bmongodb\\b",
  "hadoop"      = "\\bhadoop\\b",
  "cosmos"      = "\\bcosmos\\b",
  "mysql"       = "\\bmysql\\b",
  "spark"       = "\\bspark\\b",
  "pig"         = "\\bpig\\b",
  "python"      = "\\bpython\\b",
  "java"        = "\\bjava\\b",
  "c++"         = "\\bc\\+\\+",
  "php"         = "\\bphp\\b",
  "javascript"  = "\\bjavascript\\b",
  "objective c" = "\\bobjective[- ]c\\b",
  "ruby"        = "\\bruby\\b",
  "perl"        = "\\bperl\\b",
  "c"           = "(?<!objective[- ])\\bc\\b(?![+#])",
  "c#"          = "\\bc#",
  "r"           = "\\br\\b(?!\\s*&)"
)
languages <- names(language_patterns)

# Total number of mentions of each technology across all postings; postings
# with missing text contribute nothing.
list_of_occurence <- vapply(
  language_patterns,
  function(pattern) {
    sum(str_count(amazon$`PREFERRED QUALIFICATIONS`, pattern), na.rm = TRUE)
  },
  numeric(1),
  USE.NAMES = FALSE
)
list_of_occurence
```

```{r}
# Put labels and counts in one data frame with named, correctly typed columns.
# cbind() would coerce the counts to character and leave them in an unnamed
# column, so the plot only worked by picking `list_of_occurence` up from the
# global environment.
lan <- data.frame(
  languages = languages,
  list_of_occurence = as.numeric(list_of_occurence)
)
p15 <- ggplot(data = lan, aes(languages, list_of_occurence, fill = languages)) +
      geom_bar(stat = "identity") +
      geom_text(aes(label = list_of_occurence), color = "black", size = 3) +
      # The counts come from the preferred-qualifications text, not from the
      # job description.
      ggtitle("Languages in preferred qualifications") +
      # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none".
      guides(fill = "none") +
      coord_flip()
p15
```

Java, C, C++ and Python are the most in-demand languages in these job postings.


# How many years of experience is required?

```{r}
# First "<n> year(s)" or "<n>+ year(s)" mention in the basic qualifications.
# The look-ahead accepts the optional "+" ("5+ years"), which the plain
# "([0-9]+) year" pattern skipped, while capturing only the digits.
amazon$dummy <- str_extract(amazon$`BASIC QUALIFICATIONS`, "[0-9]+(?=\\+? year)")
# Normalise to the "<n> year" form; postings that state no number count as 0.
amazon$dummy <- ifelse(is.na(amazon$dummy), "0 year", paste(amazon$dummy, "year"))

# Convert "<n> year" strings to the number <n>.
#
# var: character vector such as "5 year"; everything from the first space on
#      is discarded.
# Returns a numeric vector the same length as `var` (NA where the leading
# token is not a number). Works on whole vectors, not just single strings.
split_year <- function(var) {
  as.numeric(sub(" .*$", "", var))
}
# vapply() guarantees a plain numeric vector (sapply() returns a list for
# empty input and names every element after its input string).
amazon$experience <- vapply(amazon$dummy, split_year, numeric(1), USE.NAMES = FALSE)

```

Let's see the experience, excluding 0 year experience.
```{r}
# Number of postings per required number of years, leaving out the 0 bucket
# (postings that state no number).
exp <- as.data.frame(table(amazon$experience))
colnames(exp) <- c("Experience", "Freq")
p16 <- ggplot(data = filter(exp, exp$Experience != 0), aes(Experience, Freq, fill = Experience)) +
      geom_bar(stat = "identity") +
      geom_text(aes(label = Freq), color = "black", size = 3) +
      ggtitle("Experience needed") +
      # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none".
      guides(fill = "none") +
      coord_flip()
p16
```

```{r fig.width=7, fig.height=5, echo=FALSE}
# Cross-tabulate position against years of experience and draw four panels:
# non-engineer positions with and without a stated number of years, then
# engineer positions with all buckets and with the 0 bucket removed.
counts <- as.data.frame(table(amazon$positions, amazon$experience))
colnames(counts) <- c("Positions", "Experience", "Freq")
p17 <- ggplot(data = filter(counts, counts$Experience != 0 & counts$Positions != "engineer"), aes(Positions, Freq)) +
      geom_bar(aes(fill = Experience), position = "dodge", stat = "identity") +
      # Title typo fixed: "enginner" -> "engineer".
      ggtitle("Positions distribution based on Experience for positions excluding engineer")
p18 <- ggplot(data = filter(counts, counts$Experience == 0 & counts$Positions != "engineer"), aes(Positions, Freq)) +
      geom_bar(aes(fill = Experience), position = "dodge", stat = "identity") +
      ggtitle("Positions where Experience is not defined excluding engineer")
p19 <- ggplot(data = filter(counts, counts$Positions == "engineer"), aes(Positions, Freq)) +
      geom_bar(aes(fill = Experience), position = "dodge", stat = "identity") +
      ggtitle("Engineer Positions distribution based on Experience")
p20 <- ggplot(data = filter(counts, counts$Experience != 0 & counts$Positions == "engineer"), aes(Positions, Freq)) +
      geom_bar(aes(fill = Experience), position = "dodge", stat = "identity") +
      ggtitle("Engineer Positions where Experience is not zero")
# Arrange the four panels in a 2 x 2 grid, filled row by row.
multiplot(p17, p18, p19, p20, layout = matrix(c(1, 2, 3, 4), nrow = 2, byrow = TRUE))

```

```{r fig.width=7, fig.height=5, echo=FALSE}
# Cross-tabulate domain against years of experience and draw four panels:
# non-software domains with and without a stated number of years, then the
# software domain with all buckets and with the 0 bucket removed.
counts <- as.data.frame(table(amazon$domain, amazon$experience))
colnames(counts) <- c("Domain", "Experience", "Freq")

p21 <- ggplot(filter(counts, Experience != 0 & Domain != "software"), mapping = aes(x = Domain, y = Freq)) +
  geom_col(mapping = aes(fill = Experience), position = "dodge") +
  labs(title = "Domain distribution based on Experience for domain excluding software")

p22 <- ggplot(filter(counts, Experience == 0 & Domain != "software"), mapping = aes(x = Domain, y = Freq)) +
  geom_col(mapping = aes(fill = Experience), position = "dodge") +
  labs(title = "Domain where Experience is not defined for domain excluding software")

p23 <- ggplot(filter(counts, Domain == "software"), mapping = aes(x = Domain, y = Freq)) +
  geom_col(mapping = aes(fill = Experience), position = "dodge") +
  labs(title = "Domain distribution based on Experience for software")

p24 <- ggplot(filter(counts, Experience != 0 & Domain == "software"), mapping = aes(x = Domain, y = Freq)) +
  geom_col(mapping = aes(fill = Experience), position = "dodge") +
  labs(title = "Domain where Experience is not zero for software")

# 2 x 2 grid, filled row by row.
multiplot(
  plotlist = list(p21, p22, p23, p24),
  layout = matrix(c(1, 2, 3, 4), nrow = 2, byrow = TRUE)
)
```

# Word Clouds of Qualifications and Job description

```{r}
library(wordcloud)
library(SnowballC)
library(RColorBrewer)
library(tm)

# Build a corpus from the preferred-qualifications text, strip punctuation and
# English stop words, and count how often each remaining term occurs.
texts <- amazon$`PREFERRED QUALIFICATIONS`
#texts <- iconv(texts, to = "utf-8")
corpus <- Corpus(VectorSource(texts))
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords('english'))
#corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, removeWords, c("00b7","and", "this", "there"))
# NOTE(review): the corpus is re-wrapped before building the term matrix;
# presumably a workaround for the PlainTextDocument step -- confirm against
# the installed tm version.
corpus <- Corpus(VectorSource(corpus))
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
head(d, 10)

# Drop leftover noise terms. Use a logical mask rather than -which(...): when
# none of the terms is present, which() returns integer(0) and d[-integer(0), ]
# selects zero rows, leaving nothing to plot.
d <- d[!(d$word %in% c("00b7","and","this","that")), ]
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
title("Preferred Qualifications Word Cloud")
```

```{r}
# Word cloud of the terms used in the BASIC QUALIFICATIONS column.
texts <- amazon$`BASIC QUALIFICATIONS`
#texts <- iconv(texts, to = "utf-8")

# Clean the text: strip punctuation, then remove English stop words plus a few
# extra tokens ("00b7" is presumably residue of an escaped U+00B7 bullet
# character -- confirm against the raw data).
corpus <- Corpus(VectorSource(texts))
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
#corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, removeWords, c("00b7", "and", "this", "there"))
# NOTE(review): the corpus is rebuilt from itself before the term matrix is
# computed; this looks like a workaround tied to the PlainTextDocument step
# above -- confirm it is still needed with the installed tm version.
corpus <- Corpus(VectorSource(corpus))

# Total number of occurrences of each term, most frequent first.
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
head(d, 10)

# Drop leftover noise terms. A logical mask is used rather than `-which(...)`:
# when none of the terms is present `which()` returns integer(0), and
# `d[-integer(0), ]` selects zero rows, which would empty the whole table.
d <- d[!(d$word %in% c("00b7", "and", "this", "that")), ]

# Fixed seed so the random parts of the cloud layout repeat between runs.
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
title("Basic Qualifications Word Cloud")
```

```{r  fig.width=5, fig.height=5, echo=FALSE}
# Word cloud of the terms used in the DESCRIPTION column.
texts <- amazon$DESCRIPTION
#texts <- iconv(texts, to = "utf-8")

# Clean the text: strip punctuation, then remove English stop words plus a few
# extra tokens ("00b7" is presumably residue of an escaped U+00B7 bullet
# character -- confirm against the raw data).
corpus <- Corpus(VectorSource(texts))
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
#corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, removeWords,
                 c("00b7", "and", "this", "there", "you", "will"))
# NOTE(review): the corpus is rebuilt from itself before the term matrix is
# computed; this looks like a workaround tied to the PlainTextDocument step
# above -- confirm it is still needed with the installed tm version.
corpus <- Corpus(VectorSource(corpus))

# Total number of occurrences of each term, most frequent first.
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
head(d, 10)

# Drop leftover noise terms. A logical mask is used rather than `-which(...)`:
# when none of the terms is present `which()` returns integer(0), and
# `d[-integer(0), ]` selects zero rows, which would empty the whole table.
d <- d[!(d$word %in% c("00b7", "and", "this", "that", "you", "will")), ]

# Fixed seed so the random parts of the cloud layout repeat between runs.
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
title("Job Description Word Cloud")
```

```{r}
# Word cloud of the terms used in the Title column.
texts <- amazon$Title
# texts <- iconv(texts, to = "utf-8")

# Text clean-up: punctuation first, then English stop words, then a short
# list of additional tokens.
corpus <- Corpus(VectorSource(texts))
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(
  corpus,
  removeWords,
  c("00b7", "and", "this", "there", "you", "will")
)
corpus <- Corpus(VectorSource(corpus))

# Per-term totals over all documents, sorted from most to least frequent.
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
head(d, 10)
# Unlike the other word clouds, no terms are filtered out of `d` here.

# Fixed seed ahead of drawing the cloud.
set.seed(1234)
wordcloud(
  words = d$word,
  freq = d$freq,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)
title("Job Title Word Cloud")
```


